import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the train and test CSVs, then show how many rows carry each `label`
# value, first as a bar chart and then as a printed table.
train = pd.read_csv('../data/atec_anti_fraud_train.csv')
test = pd.read_csv('../data/atec_anti_fraud_test_a.csv')

sns.countplot(data=train, x='label')
plt.show()

# Select `id` before counting so only that column is aggregated; the result
# is the same per-label count of non-null ids. The call form of print runs
# on both Python 2 and Python 3.
print(train.groupby('label')['id'].count())
# Candidate feature columns are those whose name contains the letter 'f'.
# int64 columns go to `cat_col`, float64 columns go to `num_col`; a column of
# any other dtype ends up in neither list.
cols = [name for name in train.columns if 'f' in name]
num_col = [name for name in cols if train[name].dtype == 'float64']
cat_col = [name for name in cols if train[name].dtype == 'int64']
# Density plots per numeric feature, split by label: one 2x2 figure per
# column with panels for label 0 (red), label 1 (blue), label -1 (green)
# and, bottom-right, all rows together (yellow).
kde_label_panels = [(0, 'r'), (1, 'b'), (-1, 'g')]
for col in num_col:
    fig, ax = plt.subplots(2, 2, figsize=(10, 10))
    # ax.flat walks the grid row by row, so the three label panels land on
    # [0][0], [0][1] and [1][0]; zip stops before the fourth axis.
    for panel_ax, (label_value, color) in zip(ax.flat, kde_label_panels):
        sns.kdeplot(train.loc[train['label'] == label_value, col],
                    color=color, ax=panel_ax)
        panel_ax.set_title('%s | label == %s' % (col, label_value))
    sns.kdeplot(train[col], color='y', ax=ax[1][1])
    ax[1][1].set_title('%s | all rows' % col)
    # Show each figure before building the next one.
    plt.show()
# Histograms per numeric feature, split by label: one 2x2 figure per column
# with panels for label 0 (red), label 1 (blue), label -1 (green) and,
# bottom-right, all rows together (yellow).
hist_label_panels = [(0, 'r'), (1, 'b'), (-1, 'g')]
for col in num_col:
    fig, ax = plt.subplots(2, 2, figsize=(10, 10))
    # ax.flat walks the grid row by row, so the three label panels land on
    # [0][0], [0][1] and [1][0]; zip stops before the fourth axis.
    for panel_ax, (label_value, color) in zip(ax.flat, hist_label_panels):
        train.loc[train['label'] == label_value, col].hist(color=color,
                                                           ax=panel_ax)
        panel_ax.set_title('%s | label == %s' % (col, label_value))
    train[col].hist(color='y', ax=ax[1][1])
    ax[1][1].set_title('%s | all rows' % col)
    # Show each figure before building the next one.
    plt.show()
# Pairwise correlation matrix over every selected feature column (numeric
# columns first, then categorical), drawn as one large heatmap.
selected_features = num_col + cat_col
corr = train[selected_features].corr()

plt.figure(figsize=(30, 30))
sns.heatmap(data=corr)
plt.show()
# For every categorical feature, stack two count plots: rows with
# label == -1 on top, rows with label == 0 underneath.
rows_label_minus_one = train[train['label'] == -1]
rows_label_zero = train[train['label'] == 0]
for col in cat_col:
    fig, (top_ax, bottom_ax) = plt.subplots(2, 1)
    sns.countplot(data=rows_label_minus_one, x=col, ax=top_ax)
    sns.countplot(data=rows_label_zero, x=col, ax=bottom_ax)
    # Show each figure before building the next one.
    plt.show()
# Label statistics over time.
#
# `date` is stringified once and reused: the first six characters give the
# `month` key and the full string gives the per-day `date_str` key.
date_as_text = train.date.map(str)
train['month'] = date_as_text.str[:6]
train['date_str'] = date_as_text

# NOTE(review): other parts of this script filter on label values 0, 1 and
# -1, so this mean averages over all three wherever -1 rows are present --
# confirm that is the intended statistic.
# Printed explicitly: as a bare expression the result is discarded when the
# file runs as a script.
print(train.groupby('month')['label'].mean())

# One bar chart per (row subset, aggregate) pair, grouped by day:
# mean label, total row count, then row counts for label 1, 0 and -1.
daily_views = [
    ('mean label per day', train, 'mean'),
    ('rows per day', train, 'count'),
    ('rows per day | label == 1', train[train.label == 1], 'count'),
    ('rows per day | label == 0', train[train.label == 0], 'count'),
    ('rows per day | label == -1', train[train.label == -1], 'count'),
]
for view_title, view_rows, stat in daily_views:
    plt.figure(figsize=(15, 8))
    view_rows.groupby('date_str')['label'].agg(stat).plot(kind='bar')
    plt.title(view_title)
    plt.show()
# Side-by-side density of each numeric feature: train on the left (red),
# test on the right (blue).
for col in num_col:
    fig, (train_ax, test_ax) = plt.subplots(1, 2, figsize=(10, 10))
    for frame, color, target_ax in ((train, 'r', train_ax),
                                    (test, 'b', test_ax)):
        sns.kdeplot(frame[col], color=color, ax=target_ax)
    # Show each figure before building the next one.
    plt.show()
# Stacked count plots of each categorical feature: train on top, test
# underneath.
for col in cat_col:
    fig, (train_ax, test_ax) = plt.subplots(2, 1)
    for frame, target_ax in ((train, train_ax), (test, test_ax)):
        sns.countplot(data=frame, x=col, ax=target_ax)
    # Show each figure before building the next one.
    plt.show()